survreg and coxph ObjectsThis page serves as supplementary material to the research paper by Li, L., Wu, T., & Feng, C. (2021). Within the aforementioned publication, we introduced a residual termed “normalized randomized survival probabilities (Z-residual) residuals.” Subsequently, for the purposes of brevity and enhanced interpretability, this nomenclature was revised. The residuals are now referred to as “Z-residuals”, reflecting the conventional use of the letter Z to denote a random variable following a standard normal distribution.
The functions available on this page may still be employed for the
computation of Z-residuals for survreg and
coxph objects. However, the authors have also released an R
package on GitHub, titled Zresidual, which is designed to
calculate Z-residuals for a broader array of statistical models. Further
details are available in the references cited below.
Wu, T., Feng, C., Li, L. (2024). Cross-validatory Z-Residual for Diagnosing Shared Frailty Models. The American Statistician 0, 1–17.
[PDF]; [Free Reprints]; [slides]; [Z-residual on Github]; [Demo]
Wu, T., Li, L., Feng, C. (2024). Z-residual diagnostic tool for assessing covariate functional form in shared frailty models. Journal of Applied Statistics 0, 1–31.
Li, L., Wu, T., Feng, C. (2021). Model diagnostics for censored regression via randomized survival probabilities. Statistics in Medicine 40, 1482–1497.
[PDF]; [R Functions and Demonstration]; [slides-1]; [slides-2];
Let \(T_i\) be a possibly right-censored observation, \(d_i\) be the indicator for being uncensored, and \(S_i(\cdot)\) be the survival function of original uncensored \(T^*_i\) given covariate \(x_i\) and parameters. The randomized survival probability (RSP) for \(T_{i}\) is defined as: \[\begin{equation} S_i^{R}(T_i, d_{i}, U_{i}) = \left\{ \begin{array}{rl} S_{i}(T_{i}), & \text{if $T_i$ is uncensored, i.e., $d_{i}=1$,}\\ U_{i}\,S_{i}(T_{i}), & \text{if $T_i$ is censored, i.e., $d_i=0$.} \end{array} \right. \label{rsp} \end{equation}\] where \(U_i\) is a uniform random number on \((0, 1)\). In Wu, Feng, and Li (2019+), we show that RSPs are uniformly distributed on \((0,1)\) under the true model for \(T^*_i\). Therefore, they can be transformed into residuals with any desired distribution. Z residuals are transformed RSPs with the standard normal quantile: \[\begin{equation} r_{i}^{\mbox{Z}}(T_i, d_{i}, U_i)=-\Phi^{-1} (S_{i}^R(T_i, d_{i}, U_i)) \end{equation}\]
Z-residuals are approximately standard normally distributed under the true model with estimated parameters. The approximate normality holds for any censoring distribution. Z-residuals can be used to check the goodness-of-fit of \(S_i(\cdot)\) quantitatively by applying SW normality test to Z-residuals, and graphically by visualizing their QQplot. Scatterplots of Z-residuals may reveal the model departure nature.
Please note that in the original publication by Li et al. (2021), a negative sign was not incorporated into the definition of the Z-residual. Upon further consideration, it was determined that the addition of a negative sign is more appropriate. This modification ensures that larger residuals correspond to larger observed values, which is consistent with the convention for Pearson’s residuals.
survreg and
coxph Objects#input: survreg_fit is a survreg object
resid_survreg<-function(survreg_fit)
{
distr<-survreg_fit$dist
y<- survreg_fit$y
m <- nrow (y)
parms<- as.numeric(survreg_fit[["parms"]])
alpha_hat<-1/survreg_fit$scale
if (distr %in% c("weibull","exponential","logistic","lognormal",
"loglogistic","gaussian", "loggaussian","rayleigh"))
{
SP<-1-(psurvreg(as.data.frame(as.matrix(y))[,-2],
survreg_fit[["linear.predictors"]],
scale=1/alpha_hat, distribution=distr))
haz <- dsurvreg(as.data.frame(as.matrix(y))[,-2],
survreg_fit[["linear.predictors"]],
scale=1/alpha_hat, distribution=distr)/SP
}else if (distr %in% "t")
{
SP<-1-(psurvreg(as.data.frame(as.matrix(y))[,-2],
survreg_fit[["linear.predictors"]],
scale=1/alpha_hat, distribution=distr,
parms = parms))
haz <- dsurvreg(as.data.frame(as.matrix(y))[,-2],
survreg_fit[["linear.predictors"]],
scale=1/alpha_hat, distribution=distr,
parms = parms)/SP
}else stop ("The distribution is not supported")
censored <- which(as.data.frame(as.matrix(y))[,-1]==0)
n.censored <- length(censored)
# Z-residuals
RSP <- SP
RSP[censored] <- RSP[censored]*runif(n.censored)
zresid <- -qnorm(RSP)
# Unmodified CS residual
ucs<- -log(SP)
# Modified CS residual
MSP<- SP
MSP[censored] <- SP[censored]/exp(1)
mcs <- -log(MSP)
nmsp <- -qnorm (MSP)
#Martingale Residual
martg<- as.data.frame(as.matrix(survreg_fit$y))[,-1]- ucs
#Deviance Residual
dev<- sign(martg)* sqrt((-2)*(martg+as.data.frame(as.matrix(survreg_fit$y))[,-1]*
log(as.data.frame(as.matrix(survreg_fit$y))[,-1]-martg)))
zresid.sw.pvalue<-shapiro.test(zresid)$p.value
#hazard function
haz_fn<- haz
list(RSP=RSP,zresid=zresid,zresid.sw.pvalue=zresid.sw.pvalue,
ucs=ucs, mcs=mcs,martg=martg, dev=dev, nmsp=nmsp,haz_fn=haz_fn)
}
# outputs:
## RSP --- Randomized Survival Probabilities
## zresid --- Normalized RSP
## zresid.sw.pvalue --- GOF test p-values by applying SW test to Z-residuals
## ucs --- unmodified CS residuals
## mcs --- modified CS residuals
## nmsp --- normalized modified SP
## martg --- Martingale residuals
## dev --- Deviance residuals
## haz_fn --- hazard function of cs residuals
#input: coxfit_fit is a coxph object
resid_coxph<-function(coxfit_fit)
{
y<- coxfit_fit$y
m <- nrow (y)
mre <- resid(coxfit_fit, type="martingale")
dre <- resid(coxfit_fit, type="deviance")
#Unmodified Cox-Snell residual
ucs <- as.data.frame(as.matrix(y))[,-1] - mre
#Survival Function
SP<- exp(-ucs)
censored <- which(as.data.frame(as.matrix(y))[,-1]==0)
n.censored <- length(censored)
#NMSP residual (normal-transformed modified survival prob)
MSP<- SP
MSP[censored] <- SP[censored]/exp(1)
mcs <- -log(MSP)
nmsp<- -qnorm(MSP)
#Z-residuals
RSP <- SP
RSP[censored] <- RSP[censored]*runif(n.censored)
zresid <- -qnorm(RSP)
zresid.sw.pvalue<-shapiro.test(zresid)$p.value
list(zresid=zresid,RSP=RSP,zresid.sw.pvalue=zresid.sw.pvalue,
ucs=ucs,mcs=mcs,nmsp=nmsp)
}
# outputs:
## RSP --- Randomized Survival Probabilities
## zresid --- Normalized RSP
## zresid.sw.pvalue --- GOF test p-values by applying SW test to Z-residuals
## ucs --- unmodified CS residuals
## mcs --- modified CS residuals
## nmsp --- normalized modified SP
## martg --- Martingale residuals
## dev --- Deviance residuals
## haz_fn --- hazard function of cs residuals
Copy the above R functions into your R enviroment.
#simulated data:t from weibull distrbution,c from exponentail distrbutions
set.seed(1)
rexp2 <- function(n, rate){ if (rate==0) rep(Inf,n) else rexp(n=n, rate = rate)}
simulated_data<- function(n,beta0 , beta1 , alpha, mean.censor)
{
x <- rbinom(n, size = 1, p = 0.5)
t0<- rexp2(n, rate= 1/mean.censor)
survreg_sim_data <- rsurvreg( n, mean = beta0 + beta1 * x,
scale=1/alpha, distribution='weibull')
t <- pmin(survreg_sim_data, t0)
d <- as.numeric(t0>= survreg_sim_data )
data_form<- data.frame(survreg_sim_data,t0,t,d,x)
out_r<-list(data_form=data_form, alpha=alpha, beta0=beta0, beta1=beta1)
return (out_r)
}
library("foreach")
library("survival")
n<-2000
beta0<-2
beta1<-1
alpha<-1.7
mean.censor<-14
#nrep is preset to a number
foreach (j = 1:nrep) %do%
{
# simulating a dataset
out_r<- simulated_data(n=n,beta0=beta0,beta1=beta1,
alpha=alpha, mean.censor=mean.censor)
simulated_data_random<-out_r$data_form
#checking censoring rate
table(simulated_data_random$d)
#fit AFT model
survreg_fit_t <- survreg(Surv(out_r[[1]]$t,
out_r[[1]]$d) ~ out_r[[1]]$x,dist="weibull")
survreg_fit_w <- survreg(Surv(out_r[[1]]$t,
out_r[[1]]$d) ~ out_r[[1]]$x,dist="lognormal")
# compute residuals
true_model <- resid_survreg(survreg_fit_t)
wrong_model<- resid_survreg(survreg_fit_w)
rsp.t <- true_model$RSP
zresid.t<- true_model$zresid
rsp.w <- wrong_model$RSP
zresid.w<- wrong_model$zresid
t<-seq(0, 50, length=n)
gp1<- which(out_r[[1]]$x==1&out_r[[1]]$d==1)
gp2<- which(out_r[[1]]$x==1&out_r[[1]]$d==0)
gp3<- which(out_r[[1]]$x==0&out_r[[1]]$d==1)
gp4<- which(out_r[[1]]$x==0&out_r[[1]]$d==0)
s1.t<-1-(psurvreg(t, mean=sum(survreg_fit_t$coefficients),
scale=survreg_fit_t$scale, distribution="weibull"))
s0.t<-1-(psurvreg(t, mean=survreg_fit_t$coefficients[[1]],
scale=survreg_fit_t$scale, distribution="weibull"))
s1.w<-1-(psurvreg(t, mean=sum(survreg_fit_w$coefficients),
scale=survreg_fit_w$scale, distribution="lognormal"))
s0.w<-1-(psurvreg(t, mean=survreg_fit_w$coefficients[[1]],
scale=survreg_fit_w$scale, distribution="lognormal"))
par(mfrow = c(2,2),mar=c(4,4,2,2))
#The plot of true model
plot(t, s1.t,type="l", ylab="S(t)",xlab="Time",
main="RSPs, True Model",ylim=c(0,1.0),xlim=c(0,40))
lines(t, s0.t)
## choose a random sample of observations for display
sp <- sample(1:n,400)
s.gp1 <- gp1[which(gp1 %in% sp)]
s.gp2 <- gp2[which(gp2 %in% sp)]
s.gp3 <- gp3[which(gp3 %in% sp)]
s.gp4 <- gp4[which(gp4 %in% sp)]
points(out_r[[1]]$t[s.gp1], rsp.t[s.gp1],col="green",pch=0)
points(out_r[[1]]$t[s.gp2],rsp.t[s.gp2],col="red",pch=8)
points(out_r[[1]]$t[s.gp3],rsp.t[s.gp3],pch=2,col="green")
points(out_r[[1]]$t[s.gp4],rsp.t[s.gp4],pch=1,col="red")
legend ("topright",
legend = c(expression('Uncensored,x'['i']*"=1"),
expression('Censored,x'['i']*'=1'),
expression('Uncensored,x'['i']*'=0'),
expression('Censored,x'['i']*'=0')),
pch=c(0,8,2,1), col = c(3,2,3,2))
hist(rsp.t,xlab="Randomized Survival Probability",main="Histogram of RSPs, True Model")
#The plot of wrong model
plot(t, s1.w,type="l",ylab="S(t)",xlab="Time",main="RSPs, Wrong Model",
ylim=c(0,1.0),xlim=c(0,40))
lines(t, s0.w)
sp <- sample(1:n,400)
s.gp1 <- gp1[which(gp1 %in% sp)]
s.gp2 <- gp2[which(gp2 %in% sp)]
s.gp3 <- gp3[which(gp3 %in% sp)]
s.gp4 <- gp4[which(gp4 %in% sp)]
points(out_r[[1]]$t[s.gp1],rsp.w[s.gp1],col="green",pch=0)
points(out_r[[1]]$t[s.gp2],rsp.w[s.gp2],col="red",pch=8)
points(out_r[[1]]$t[s.gp3],rsp.w[s.gp3],pch=2,col="green")
points(out_r[[1]]$t[s.gp4],rsp.w[s.gp4],pch=1,col="red")
legend ("topright", legend = c(expression('Uncensored,x'['i']*"=1"),
expression('Censored,x'['i']*'=1'),
expression('Uncensored,x'['i']*'=0'),
expression('Censored,x'['i']*'=0')),
pch=c(0,8,2,1), col = c(3,2,3,2)
)
hist(rsp.w,xlab="Randomized Survival Probability",
main="Histogram of RSPs, Wrong Model")
}
#simulated data:t from weibull distrbution,c from exponentail distrbutions
set.seed(1)
rexp2 <- function(n, rate){ if (rate==0) rep(Inf,n) else rexp(n=n, rate = rate)}
simulated_data<- function(n,beta0 , beta1 , alpha, mean.censor)
{
x <- rbinom(n, size = 1, p = 0.5)
t0<- rexp2(n, rate= 1/mean.censor)
survreg_sim_data <- rsurvreg( n, mean = beta0 + beta1 * x,
scale=1/alpha, distribution='weibull')
t <- pmin(survreg_sim_data, t0)
d <- as.numeric(t0>= survreg_sim_data )
data_form<- data.frame(survreg_sim_data,t0,t,d,x)
out_r<-list(data_form=data_form, alpha=alpha, beta0=beta0, beta1=beta1)
return (out_r)
}
library("foreach")
library("survival")
n<- 800
beta0<-2
beta1<-1
alpha<-2
mean.censor<-14.6
#nrep is preset to a number
foreach (j = 1:nrep) %do%
{
# simulating a dataset
out_r<- simulated_data(n=n,beta0=beta0,beta1=beta1,
alpha=alpha, mean.censor=mean.censor)
simulated_data_random<-out_r$data_form
#checking censoring rate
table(simulated_data_random$d)
#fit AFT model
true_model <- survreg(Surv(out_r[[1]]$t, out_r[[1]]$d) ~ out_r[[1]]$x,dist="weibull")
wrong_model <- survreg(Surv(out_r[[1]]$t, out_r[[1]]$d) ~ out_r[[1]]$x,dist="lognormal")
zresid.t<-resid_survreg(true_model)
zresid.w<-resid_survreg(wrong_model)
#the cumulative hazard function estimated by Kaplan-Meier method of CS residuals
km.ucs.t <- survfit(Surv(zresid.t$ucs, out_r[[1]]$d)~1,type='fleming')
id.ucs.t<-order(zresid.t$ucs)
km.ucs.w <- survfit(Surv(zresid.w$ucs, out_r[[1]]$d)~1,type='fleming')
id.ucs.w<-order(zresid.w$ucs)
#The plot of true model
par(mfrow = c(2,3),mar=c(4,4,2,2))
resid.lim <- c(-1,1) * max(range(abs(zresid.t$zresid),abs(zresid.w$zresid)),5)
cs.lim <- c(0,max(6,range(km.ucs.t$cumhaz,km.ucs.w$cumhaz)))
plot(zresid.t$zresid,ylab="Z-residuals",col=out_r[[1]]$d+2,pch=out_r[[1]]$d+1,
ylim=resid.lim, main="True Model, Z-residuals plot")
legend ("topleft", pch = c(2,1), legend = c("Uncensored", "Censored"),
col = c(3,2), cex=1)
qqnorm(zresid.t$zresid,xlab="Theoretical Quantiles", ylab="Sample Quantiles",
main= paste0("True Model, QQ plot, SW p-value=",
sprintf("%4.3f",zresid.t$zresid.sw.pvalue)),
xlim=resid.lim, ylim = resid.lim)
abline(a=0, b=1)
plot(km.ucs.t, fun="cumhaz", xlab="Unmodified Cox-Snell Residuals",
ylab="Cumulative Hazard Function",
main="True Model, Cum. Hazard of CS Residuals",
xlim=c(0,6), ylim = c(0,6))
abline(a=0, b=1, col="blue", lty=2)
points(km.ucs.t$time, -log(km.ucs.t$surv), col =out_r[[1]]$d[id.ucs.t]+2,
pch=out_r[[1]]$d[id.ucs.t]+1)
legend ("topleft", pch = c(2,1), legend = c("Uncensored", "Censored"),
col = c(3,2), cex=1)
#The plot of wrong model
plot(zresid.w$zresid,ylab="Z-residuals",col=out_r[[1]]$d+2,pch=out_r[[1]]$d+1,
ylim=resid.lim, main="Wrong Model, Z-residuals plot")
legend ("topleft", pch = c(2,1), legend = c("Uncensored", "Censored"),col = c(3,2), cex=1)
qqnorm(zresid.w$zresid,xlab="Theoretical Quantiles", ylab="Sample Quantiles",
main=paste0("Wrong Model, QQ plot, SW p-value=",
sprintf("%4.3f",zresid.w$zresid.sw.pvalue)),
xlim=resid.lim, ylim = resid.lim)
abline(a=0,b=1)
plot(km.ucs.w, fun="cumhaz", xlab=("Unmodified Cox-Snell Residuals"),
ylab=("Cumulative Hazard Function"),
main="Wrong Model, Cum. Hazard of CS Residuals",
xlim=c(0,6), ylim = c(0,6))
abline(a=0, b=1, col="blue", lty=2)
points(km.ucs.w$time, -log(km.ucs.w$surv), col =out_r[[1]]$d[id.ucs.w]+2,
pch=out_r[[1]]$d[id.ucs.w]+1)
legend ("topleft", pch = c(2,1), legend = c("Uncensored", "Censored"),col = c(3,2), cex=1)
}